## Generate a sentence using a topic model for
## Nielsen 2019 APSA-MENA newsletter article

## Set your directory here:
## (this is an absolute path on the original author's machine; change it to the
## folder on your machine that holds the input text file before running)
mydir <- "C:/Users/Richard Nielsen/Dropbox (MIT)/01_Papers/APSA-MENA text analysis/"
## make that folder the working directory, so the relative file name passed to
## readLines() further down is looked up there
setwd(mydir)

## Load libraries
library(tm)
library(stm)

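## Check environment and library versions.
## NOTE: the recorded output below lists tm and NLP but not stm, so it appears
## to have been captured before stm was attached -- TODO record the stm version.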
sessionInfo()

## R version 3.5.3 (2019-03-11)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18362)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tm_0.7-6  NLP_0.2-0
## 
## loaded via a namespace (and not attached):
## [1] compiler_3.5.3 parallel_3.5.3 Rcpp_1.0.1     slam_0.1-45    xml2_1.2.0

## Load the draft text; readLines() returns one character element per line
dat <- readLines("draftForSTM.txt")
dat
## keep only the lines that are not the empty string
dat <- dat[dat != ""]
dat

## Wrap the character vector of text lines in a tm corpus
corp <- Corpus(VectorSource(dat))
## Build the document-term matrix, asking tm to lower-case, stem, and strip
## stopwords, numbers and punctuation.
## NOTE(review): `minWordLength` does not appear among the control options that
## recent tm versions document for termFreq() (they use `wordLengths`), so it
## is probably ignored and tm's default word-length bounds apply -- confirm
## before changing it, since that would alter the vocabulary.
dtm0 <- DocumentTermMatrix(
  corp,
  control = list(
    tolower = TRUE,
    stemming = TRUE,
    stopwords = TRUE,
    minWordLength = 1,
    removeNumbers = TRUE,
    removePunctuation = TRUE
  )
)

## Inspect the resulting matrix and its dimensions
dtm0
dim(dtm0)

## Turn the document-term matrix into a dense integer data frame called `dtm`.
## Sparse-term pruning is currently switched off: the removeSparseTerms() call
## is commented out and the matrix is passed through unchanged.
#dtm2 <- removeSparseTerms(dtm0, 0.95)
dtm2 <- dtm0
dim(dtm2)
## dense matrix of counts, coerced column by column to integer (for storage
## reasons); the result is a plain matrix whose column names are the terms
dtm4 <- apply(as.matrix(dtm2), MARGIN = 2, as.integer)
## make sure all docs still have words left
doclen <- apply(dtm4, MARGIN = 1, sum)
min(doclen) ## should be more than 0
## make it a data frame.
## check.names = FALSE keeps the terms exactly as they are in the matrix; the
## default would pass them through make.names(), silently rewriting any term
## that is not a syntactic R name (e.g. "next" would become "next.") and so
## changing the vocabulary that ends up in the topic model.
dtm <- data.frame(dtm4, check.names = FALSE)
## remove the copies
rm(dtm4, dtm2, dtm0)
## look at the results: dimensions, then the top-left corner (at most 10 x 10,
## so the peek also works when there are fewer than 10 documents or terms)
dim(dtm) #
dtm[seq_len(min(10, nrow(dtm))), seq_len(min(10, ncol(dtm)))]


## Convert the dense document-term table into the documents/vocab pair that
## stm works with
processed <- readCorpus(dtm, type = "dtm")
## Run stm's document preparation step; no metadata is passed in here.
## NOTE(review): prepDocuments() can drop rare terms, and with them documents
## that end up empty, which would shift the row indices of the fitted model
## relative to `dat` -- check its printed output to confirm nothing was removed.
out <- prepDocuments(processed$documents, processed$vocab)
## Number of topics.
## I selected three somewhat arbitrarily here.  It's a short piece.
K <- 3
## Fit the topic model, seeding the random number generator first
set.seed(1234)
stm.out <- stm(
  out$documents,
  out$vocab,
  K = K,
  data = out$meta,
  init.type = "LDA"
)

## Label the topics: collapse each row of the FREX word table (7 words per
## topic) into a single comma-separated string
lab <- labelTopics(stm.out, n = 7)
lab <- apply(lab$frex, 1, paste, collapse = ", ")
lab

## Sample for the paragraph I want to complete: split 5 word slots across the
## topics, using row 13 of the fitted document-topic proportions as weights
set.seed(12345)
topicSamp <- rmultinom(n = 1, size = 5, prob = stm.out$theta[13, ])
topicSamp

## Sample the words: 5 multinomial draws over the vocabulary, weighted by the
## word probabilities of topic 3.
## NOTE(review): topicSamp is only printed, not used; the topic index 3 is
## hard-coded, presumably because topicSamp assigned every slot to topic 3 --
## confirm against the printed topicSamp.
set.seed(12345)
wordDraw <- rmultinom(n = 1, size = 5, prob = exp(stm.out$beta$logbeta[[1]][3, ]))
sum(wordDraw)
## Expand the per-word counts into the drawn words. Repeating each vocabulary
## entry by its count keeps words that were drawn more than once; selecting
## only the entries whose count equals 1 would silently drop them and return
## fewer than 5 words. When no word repeats the result is unchanged.
words <- rep(stm.out$vocab, times = wordDraw[, 1])
## put the drawn words in a random order
set.seed(12345)
sample(words)





